{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方包\n",
    "import pandas as pd\n",
    "# 导入数据\n",
    "Knowledge = pd.read_excel(r'C:\\Users\\Administrator\\Desktop\\Knowledge.xlsx')\n",
    "# 返回前5行数据\n",
    "Knowledge.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 构造训练集和测试集\n",
    "# 导入第三方模块\n",
    "from sklearn import model_selection\n",
    "# 将数据集拆分为训练集和测试集\n",
    "predictors = Knowledge.columns[:-1]\n",
    "X_train, X_test, y_train, y_test = model_selection.train_test_split(Knowledge[predictors], Knowledge.UNS, \n",
    "                                                                    test_size = 0.25, random_state = 1234)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方模块\n",
    "import numpy as np\n",
    "from sklearn import neighbors\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# 设置待测试的不同k值\n",
    "K = np.arange(1,np.ceil(np.log2(Knowledge.shape[0])))\n",
    "# 构建空的列表，用于存储平均准确率\n",
    "accuracy = []\n",
    "for k in K:\n",
    "    # 使用10重交叉验证的方法，比对每一个k值下KNN模型的预测准确率\n",
    "    cv_result = model_selection.cross_val_score(neighbors.KNeighborsClassifier(n_neighbors = k, weights = 'distance'), \n",
    "                                                X_train, y_train, cv = 10, scoring='accuracy')\n",
    "    accuracy.append(cv_result.mean())\n",
    "\n",
    "# 从k个平均准确率中挑选出最大值所对应的下标    \n",
    "arg_max = np.array(accuracy).argmax()\n",
    "# 中文和负号的正常显示\n",
    "plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']\n",
    "plt.rcParams['axes.unicode_minus'] = False\n",
    "# 绘制不同K值与平均预测准确率之间的折线图\n",
    "plt.plot(K, accuracy)\n",
    "# 添加点图\n",
    "plt.scatter(K, accuracy)\n",
    "# 添加文字说明\n",
    "plt.text(K[arg_max], accuracy[arg_max], '最佳k值为%s' %int(K[arg_max]))\n",
    "# 显示图形\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方模块\n",
    "from sklearn import metrics\n",
    "\n",
    "# 重新构建模型，并将最佳的近邻个数设置为6\n",
    "knn_class = neighbors.KNeighborsClassifier(n_neighbors = 6, weights = 'distance')\n",
    "# 模型拟合\n",
    "knn_class.fit(X_train, y_train)\n",
    "# 模型在测试数据集上的预测\n",
    "predict = knn_class.predict(X_test)\n",
    "# 构建混淆矩阵\n",
    "cm = pd.crosstab(predict,y_test)\n",
    "cm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 导入第三方模块\n",
    "import seaborn as sns\n",
    "\n",
    "# 将混淆矩阵构造成数据框，并加上字段名和行名称，用于行或列的含义说明\n",
    "cm = pd.DataFrame(cm)\n",
    "# 绘制热力图\n",
    "sns.heatmap(cm, annot = True,cmap = 'GnBu')\n",
    "# 添加x轴和y轴的标签\n",
    "plt.xlabel(' Real Lable')\n",
    "plt.ylabel(' Predict Lable')\n",
    "# 图形显示\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 模型整体的预测准确率\n",
    "metrics.scorer.accuracy_score(y_test, predict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 分类模型的评估报告\n",
    "print(metrics.classification_report(y_test, predict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 读入数据\n",
    "ccpp = pd.read_excel(r'C:\\Users\\Administrator\\Desktop\\CCPP.xlsx')\n",
    "ccpp.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 返回数据集的行数与列数\n",
    "ccpp.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 导入第三方包\n",
    "from sklearn.preprocessing import minmax_scale\n",
    "# 对所有自变量数据作标准化处理\n",
    "predictors = ccpp.columns[:-1]\n",
    "X = minmax_scale(ccpp[predictors])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 将数据集拆分为训练集和测试集\n",
    "X_train, X_test, y_train, y_test = model_selection.train_test_split(X, ccpp.PE, \n",
    "                                                                    test_size = 0.25, random_state = 1234)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 设置待测试的不同k值\n",
    "K = np.arange(1,np.ceil(np.log2(ccpp.shape[0])))\n",
    "# 构建空的列表，用于存储平均MSE\n",
    "mse = []\n",
    "for k in K:\n",
    "    # 使用10重交叉验证的方法，比对每一个k值下KNN模型的计算MSE\n",
    "    cv_result = model_selection.cross_val_score(neighbors.KNeighborsRegressor(n_neighbors = k, weights = 'distance'), \n",
    "                                                X_train, y_train, cv = 10, scoring='neg_mean_squared_error')\n",
    "    mse.append((-1*cv_result).mean())\n",
    "\n",
    "# 从k个平均MSE中挑选出最小值所对应的下标  \n",
    "arg_min = np.array(mse).argmin()\n",
    "# 绘制不同K值与平均MSE之间的折线图\n",
    "plt.plot(K, mse)\n",
    "# 添加点图\n",
    "plt.scatter(K, mse)\n",
    "# 添加文字说明\n",
    "plt.text(K[arg_min], mse[arg_min] + 0.5, '最佳k值为%s' %int(K[arg_min]))\n",
    "# 显示图形\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 重新构建模型，并将最佳的近邻个数设置为7\n",
    "knn_reg = neighbors.KNeighborsRegressor(n_neighbors = 7, weights = 'distance')\n",
    "# 模型拟合\n",
    "knn_reg.fit(X_train, y_train)\n",
    "# 模型在测试集上的预测\n",
    "predict = knn_reg.predict(X_test)\n",
    "# 计算MSE值\n",
    "metrics.mean_squared_error(y_test, predict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 对比真实值和实际值\n",
    "pd.DataFrame({'Real':y_test,'Predict':predict}, columns=['Real','Predict']).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方模块\n",
    "from sklearn import tree\n",
    "\n",
    "# 预设各参数的不同选项值\n",
    "max_depth = [19,21,23,25,27]\n",
    "min_samples_split = [2,4,6,8]\n",
    "min_samples_leaf = [2,4,8,10,12]\n",
    "parameters = {'max_depth':max_depth, 'min_samples_split':min_samples_split, 'min_samples_leaf':min_samples_leaf}\n",
    "# 网格搜索法，测试不同的参数值\n",
    "grid_dtreg = model_selection.GridSearchCV(estimator = tree.DecisionTreeRegressor(), param_grid = parameters, cv=10)\n",
    "# 模型拟合\n",
    "grid_dtreg.fit(X_train, y_train)\n",
    "# 返回最佳组合的参数值\n",
    "grid_dtreg.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 构建用于回归的决策树\n",
    "CART_Reg = tree.DecisionTreeRegressor(max_depth = 21, min_samples_leaf = 10, min_samples_split = 6)\n",
    "# 回归树拟合\n",
    "CART_Reg.fit(X_train, y_train)\n",
    "# 模型在测试集上的预测\n",
    "pred = CART_Reg.predict(X_test)\n",
    "# 计算衡量模型好坏的MSE值\n",
    "metrics.mean_squared_error(y_test, pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
